Introduction

Introduction

Oral cancer, a major subtype of head and neck cancers, remains one of the most aggressive malignancies globally. Oral squamous cell carcinoma (OSCC) often arises from premalignant lesions such as leukoplakia and progresses through intermediate pathological stages, including dysplasia and hyperkeratosis with no recurrence (HkNR), before evolving into invasive cancer. Despite therapeutic advancements, early detection remains a challenge, contributing to a persistently low 5-year survival rate.

In this study, we analyze transcriptomic data from the GEO dataset GSE227919, which includes samples from healthy controls, dysplasia, HkNR, and cancer tissues. By performing differential expression analysis across these sequential stages—Dysplasia vs Control, HkNR vs Dysplasia, and Cancer vs HkNR—we aim to capture key transcriptional changes along the disease trajectory. Enrichment analyses, including g:Profiler and GSEA (Gene Set Enrichment Analysis), are employed to identify dysregulated pathways. This approach enables us to explore the molecular landscape of oral cancer progression and potentially highlight targets for early intervention and therapeutic development.

1. Load Packages and Data

library(tidyverse)
library(limma)
library(edgeR)
library(pheatmap)
library(plotly)
library(ggplot2)
library(dplyr)
library(VennDiagram)
library(gprofiler2)
library(clusterProfiler)
library(msigdbr)
library(ggvenn)
library(DT)
library(knitr)
library(purrr)


expression_data <- read.table("project_data.txt", header = TRUE, row.names = 1) %>% as.matrix()
metadata <- read.csv("project_metadata.csv")

2. Preprocessing

common_samples <- intersect(colnames(expression_data), metadata$sample)
expr_mat <- expression_data[, common_samples]
meta4 <- metadata %>% filter(sample %in% common_samples) %>% arrange(match(sample, common_samples))
rownames(meta4) <- meta4$sample
meta4$Stage <- factor(meta4$group, levels = c("Control", "Dysplasia", "HkNR", "Cancer"), ordered = TRUE)
log_expr <- log2(expr_mat + 1)

3. PCA and Variance Plots

pca <- prcomp(t(log_expr), scale. = TRUE)
pca_df <- as.data.frame(pca$x)
pca_df$Stage <- meta4$Stage

ggplot(pca_df, aes(x = PC1, y = PC2, color = Stage)) +
  geom_point(size = 3) +
  theme_minimal() +
  labs(title = "PCA of Samples")

4. limma-voom Differential Expression

design4 <- model.matrix(~0 + Stage, data = meta4)
colnames(design4) <- levels(meta4$Stage)
dge <- DGEList(counts = expr_mat)
dge <- calcNormFactors(dge)
v <- voom(dge, design4, plot = FALSE)
fit <- lmFit(v, design4)

cont4 <- makeContrasts(
  Dys_vs_Control = Dysplasia - Control,
  HkNR_vs_Dys    = HkNR - Dysplasia,
  Canc_vs_HkNR   = Cancer - HkNR,
  Canc_vs_Control = Cancer - Control,
  levels = design4
)

fit2 <- contrasts.fit(fit, cont4) %>% eBayes()

de_Dys_vs_Control <- topTable(fit2, coef = "Dys_vs_Control", adjust.method = "BH", number = Inf) %>% rownames_to_column("geneID")
de_HkNR_vs_Dys    <- topTable(fit2, coef = "HkNR_vs_Dys", adjust.method = "BH", number = Inf) %>% rownames_to_column("geneID")
de_Canc_vs_HkNR   <- topTable(fit2, coef = "Canc_vs_HkNR", adjust.method = "BH", number = Inf) %>% rownames_to_column("geneID")
de_Canc_vs_Control   <- topTable(fit2, coef = "Canc_vs_Control", adjust.method = "BH", number = Inf) %>% rownames_to_column("geneID")

4A. Top DEG Tables

get_top_deg_table <- function(de_df, contrast_label) {
  de_df <- de_df %>%
    mutate(Direction = case_when(
      adj.P.Val < 0.05 & logFC > 1  ~ "Upregulated",
      adj.P.Val < 0.05 & logFC < -1 ~ "Downregulated",
      TRUE                          ~ "Not Significant"
    )) %>%
    filter(Direction %in% c("Upregulated", "Downregulated")) %>%
    arrange(desc(abs(logFC))) %>%
    slice(1:20)

  cat(paste0("### Top 20 DEGs: ", contrast_label, "\n\n"))
  datatable(de_df, options = list(pageLength = 10), rownames = FALSE)
}

get_top_deg_table(de_Dys_vs_Control, "Dysplasia vs Control")

Top 20 DEGs: Dysplasia vs Control

get_top_deg_table(de_HkNR_vs_Dys, "HkNR vs Dysplasia")

Top 20 DEGs: HkNR vs Dysplasia

get_top_deg_table(de_Canc_vs_HkNR, "Cancer vs HkNR")

Top 20 DEGs: Cancer vs HkNR

get_top_deg_table(de_Canc_vs_Control, "Cancer vs Control")

Top 20 DEGs: Cancer vs Control

5. Volcano Plots

plot_volcano<- function(df, title = "Volcano plot") {
  
  df <- df %>% 
    mutate(
      logP = -log10(adj.P.Val + 1e-300),
      Direction = case_when(
        adj.P.Val < 0.05 & logFC >  1  ~ "Up",
        adj.P.Val < 0.05 & logFC < -1  ~ "Down",
        TRUE                            ~ "NS"
      )
    )
  
  ggplot(df, aes(x = logFC, y = logP, colour = Direction)) +
    geom_point(alpha = 0.7, size = 1.5) +
    scale_colour_manual(values = c(Up = "red", Down = "blue", NS = "grey")) +
    geom_vline(xintercept = c(-1, 1), linetype = "dashed") +
    geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
    labs(
      title  = title,
      x      = "log2FC",
      y      = "-log10(FDR)",
      colour = "Direction"
    ) +
    theme_minimal(base_size = 12)
}

plot_volcano(de_Dys_vs_Control, "Volcano: Dysplasia vs Control")

plot_volcano(de_HkNR_vs_Dys,    "Volcano: HkNR vs Dysplasia")

plot_volcano(de_Canc_vs_HkNR,   "Volcano: Cancer vs HkNR")

plot_volcano(de_Canc_vs_Control,   "Volcano: Cancer vs Control")

6. Venn Diagram of Significant Genes

genes1 <- de_Dys_vs_Control %>% filter(adj.P.Val < 0.05) %>% pull(geneID)
genes2 <- de_HkNR_vs_Dys %>% filter(adj.P.Val < 0.05) %>% pull(geneID)
genes3 <- de_Canc_vs_HkNR %>% filter(adj.P.Val < 0.05) %>% pull(geneID)

venn_list <- list(
  "Dys vs Control" = genes1,
  "HkNR vs Dys"    = genes2,
  "Cancer vs HkNR" = genes3
)

ggvenn(venn_list, fill_color = c("red", "green", "blue"), text_size = 4, set_name_size = 5)

7. Heatmap of Top DEGs

top20 <- de_Dys_vs_Control %>% arrange(adj.P.Val) %>% slice_head(n = 20) %>% pull(geneID)
heat_mat <- log_expr[top20, ]
ord_samps <- meta4 %>% arrange(Stage) %>% pull(sample)
ann <- data.frame(Stage = meta4$Stage)
rownames(ann) <- meta4$sample

pheatmap(heat_mat[, ord_samps], annotation_col = ann[ord_samps,, drop = FALSE],
         cluster_rows = TRUE, cluster_cols = FALSE, show_rownames = TRUE,
         main = "Top DEGs Across Stages")

8. GO Enrichment (g:Profiler)

sig_genes <- genes3
gostres <- gost(query = sig_genes, organism = "hsapiens", correction_method = "fdr")
gostplot(gostres, interactive = TRUE, capped = TRUE)

9. Gene Expression Trajectories

top_genes <- unique(c(
  de_Dys_vs_Control %>% arrange(adj.P.Val) %>% slice(1:5) %>% pull(geneID),
  de_HkNR_vs_Dys %>% arrange(adj.P.Val) %>% slice(1:5) %>% pull(geneID),
  de_Canc_vs_HkNR %>% arrange(adj.P.Val) %>% slice(1:5) %>% pull(geneID)
))

expr_long <- log_expr[top_genes, ] %>% as.data.frame() %>%
  rownames_to_column("Gene") %>%
  pivot_longer(-Gene, names_to = "sample", values_to = "logExpr") %>%
  left_join(meta4, by = "sample")

avg_expr <- expr_long %>% group_by(Gene, Stage) %>% summarise(meanExpr = mean(logExpr), .groups = "drop")

ggplot(avg_expr, aes(x = Stage, y = meanExpr, group = 1)) +
  geom_line(color = "steelblue", size = 1) +
  geom_point(size = 2) +
  facet_wrap(~ Gene, scales = "free_y") +
  theme_minimal(base_size = 13) +
  labs(title = "Gene-wise Expression Trajectories", y = "Mean log2(Expression + 1)", x = "Stage")

10. GSEA (MSigDB C2)

library(BiocParallel)
BPPARAM <- SerialParam()  # Single line fix for parallel issues

hs_c2 <- msigdbr(species = "Homo sapiens", category = "C2") %>%
  select(gs_name, gene_symbol)

rank_and_run_gsea <- function(de_df, label) {
  ranked <- deframe(de_df[, c("geneID", "logFC")]) %>% sort(decreasing = TRUE)
  GSEA(ranked, TERM2GENE = hs_c2, verbose = FALSE, BPPARAM = BPPARAM)@result %>%
    transmute(ID, Description, NES, p.adjust, Stage = label)
}

df_gsea <- bind_rows(
  rank_and_run_gsea(de_Dys_vs_Control, "Dysplasia vs Control"),
  rank_and_run_gsea(de_HkNR_vs_Dys,    "HkNR vs Dysplasia"),
  rank_and_run_gsea(de_Canc_vs_HkNR,   "Cancer vs HkNR")
)

top_terms <- df_gsea %>%
  filter(p.adjust < 0.05) %>%
  group_by(ID) %>%
  summarise(mean_NES = mean(abs(NES)), .groups = "drop") %>%
  slice_max(mean_NES, n = 10) %>%
  pull(ID)

ggplot(df_gsea %>% filter(ID %in% top_terms),
       aes(x = Stage, y = NES, group = ID, color = ID)) +
  geom_line(size = 1) + geom_point(size = 2) +
  theme_minimal(base_size = 13) +
  labs(title = "GSEA: NES Trajectories Across Stages", x = "Comparison", y = "NES") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))